In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
Importing the Breast Cancer dataset¶
In [2]:
# Read the CSV file into the DataFrame that every later cell works on.
DATA_PATH = "data.csv"
data = pd.read_csv(DATA_PATH)
Basic understanding of the Breast Cancer Wisconsin (Diagnostic) dataset¶
In [43]:
# Show a caption, then display the first ten records of the frame.
# NOTE(review): the saved output for this cell carries execution count 43 and
# lists non-consecutive index labels, so it was probably produced after later
# cells had modified `data` -- re-run on a fresh kernel to confirm.
preview_size = 10
print("Top 10 rows :\n")
data.head(preview_size)
Top 10 rows :
Out[43]:
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 8510426 | 0 | 13.540 | 14.36 | 87.46 | 566.3 | 0.09779 | 0.08129 | 0.06664 | 0.04781 | ... | 15.11 | 19.26 | 99.70 | 711.2 | 0.14400 | 0.17730 | 0.23900 | 0.12880 | 0.2977 | 0.07259 |
| 20 | 8510653 | 0 | 13.080 | 15.71 | 85.63 | 520.0 | 0.10750 | 0.12700 | 0.04568 | 0.03110 | ... | 14.50 | 20.49 | 96.09 | 630.5 | 0.13120 | 0.27760 | 0.18900 | 0.07283 | 0.3184 | 0.08183 |
| 21 | 8510824 | 0 | 9.504 | 12.44 | 60.34 | 273.9 | 0.10240 | 0.06492 | 0.02956 | 0.02076 | ... | 10.23 | 15.66 | 65.13 | 314.9 | 0.13240 | 0.11480 | 0.08867 | 0.06227 | 0.2450 | 0.07773 |
| 37 | 854941 | 0 | 13.030 | 18.42 | 82.61 | 523.8 | 0.08983 | 0.03766 | 0.02562 | 0.02923 | ... | 13.30 | 22.81 | 84.46 | 545.9 | 0.09701 | 0.04619 | 0.04833 | 0.05013 | 0.1987 | 0.06169 |
| 40 | 855167 | 1 | 13.440 | 21.58 | 86.18 | 563.0 | 0.08162 | 0.06031 | 0.03110 | 0.02031 | ... | 15.93 | 30.25 | 102.50 | 787.9 | 0.10940 | 0.20430 | 0.20850 | 0.11120 | 0.2994 | 0.07146 |
| 43 | 856106 | 1 | 13.280 | 20.28 | 87.32 | 545.2 | 0.10410 | 0.14360 | 0.09847 | 0.06158 | ... | 17.38 | 28.00 | 113.10 | 907.2 | 0.15300 | 0.37240 | 0.36640 | 0.14920 | 0.3739 | 0.10270 |
| 48 | 857155 | 0 | 12.050 | 14.63 | 78.04 | 449.3 | 0.10310 | 0.09092 | 0.06592 | 0.02749 | ... | 13.76 | 20.70 | 89.88 | 582.6 | 0.14940 | 0.21560 | 0.30500 | 0.06548 | 0.2747 | 0.08301 |
| 49 | 857156 | 0 | 13.490 | 22.30 | 86.91 | 561.0 | 0.08752 | 0.07698 | 0.04751 | 0.03384 | ... | 15.15 | 31.82 | 99.00 | 698.8 | 0.11620 | 0.17110 | 0.22820 | 0.12820 | 0.2871 | 0.06917 |
| 50 | 857343 | 0 | 11.760 | 21.60 | 74.72 | 427.9 | 0.08637 | 0.04966 | 0.01657 | 0.01115 | ... | 12.98 | 25.72 | 82.98 | 516.5 | 0.10850 | 0.08615 | 0.05523 | 0.03715 | 0.2433 | 0.06563 |
| 51 | 857373 | 0 | 13.640 | 16.34 | 87.21 | 571.8 | 0.07685 | 0.06059 | 0.01857 | 0.01723 | ... | 14.67 | 23.19 | 96.08 | 656.7 | 0.10890 | 0.15820 | 0.10500 | 0.08586 | 0.2346 | 0.08025 |
10 rows × 32 columns
In [4]:
# DataFrame.shape is (number of rows, number of columns); keep both for reporting.
Total_Rows, Total_Cols = data.shape
print("Total Rows is :", Total_Rows)
print("Total columns is :", Total_Cols)
Total Rows is : 569 Total columns is : 33
In [5]:
# Print a header, then the per-column dtype and non-null summary of the frame.
# The header names the Breast Cancer data loaded by this notebook (it used to
# mention an unrelated dataset).
print( "\n Information about the Breast Cancer dataset : \n " )
data.info()
Information about the Titanic dataset : <class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 Unnamed: 32 0 non-null float64 dtypes: float64(31), int64(1), object(1) memory usage: 146.8+ KB
In [6]:
# Count the rows that are exact repeats of an earlier row.
duplicate_row_count = data.duplicated().sum()
print("duplicate values is : ", duplicate_row_count)
duplicate values is : 0
In [ ]:
Column distribution¶
In [7]:
# List every column label of the frame as a plain Python list.
all_column_names = data.columns.tolist()
print("Total columns is here :\n", all_column_names)
Total columns is here : ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']
In [8]:
# Labels of every column stored as int64 or float64.
numerical_col = data.select_dtypes(include=["int64", "float64"]).columns
print("Total Numerical columns list is here :\n", numerical_col)
# len() counts the labels directly; building a value_counts() Series and
# summing it is an indirect way to get the same number.
print("\nTotal Numerical columns is here :\n", len(numerical_col))
Total Numerical columns list is here :
Index(['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
dtype='object')
Total Numerical columns is here :
32
In [9]:
# Labels of every column stored with the object dtype.
categorical_col = data.select_dtypes(include=["O"]).columns
print("Total categorical columns list is here :\n", categorical_col)
# len() counts the labels directly; building a value_counts() Series and
# summing it is an indirect way to get the same number.
print("\nTotal categorical columns is here :\n", len(categorical_col))
Total categorical columns list is here : Index(['diagnosis'], dtype='object') Total categorical columns is here : 1
In [ ]:
Basic statistics of the Breast Cancer data¶
In [10]:
# Summary statistics with default settings, transposed so each column is a row.
default_summary = data.describe()
default_summary.transpose()
Out[10]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 569.0 | 3.037183e+07 | 1.250206e+08 | 8670.000000 | 869218.000000 | 906024.000000 | 8.813129e+06 | 9.113205e+08 |
| radius_mean | 569.0 | 1.412729e+01 | 3.524049e+00 | 6.981000 | 11.700000 | 13.370000 | 1.578000e+01 | 2.811000e+01 |
| texture_mean | 569.0 | 1.928965e+01 | 4.301036e+00 | 9.710000 | 16.170000 | 18.840000 | 2.180000e+01 | 3.928000e+01 |
| perimeter_mean | 569.0 | 9.196903e+01 | 2.429898e+01 | 43.790000 | 75.170000 | 86.240000 | 1.041000e+02 | 1.885000e+02 |
| area_mean | 569.0 | 6.548891e+02 | 3.519141e+02 | 143.500000 | 420.300000 | 551.100000 | 7.827000e+02 | 2.501000e+03 |
| smoothness_mean | 569.0 | 9.636028e-02 | 1.406413e-02 | 0.052630 | 0.086370 | 0.095870 | 1.053000e-01 | 1.634000e-01 |
| compactness_mean | 569.0 | 1.043410e-01 | 5.281276e-02 | 0.019380 | 0.064920 | 0.092630 | 1.304000e-01 | 3.454000e-01 |
| concavity_mean | 569.0 | 8.879932e-02 | 7.971981e-02 | 0.000000 | 0.029560 | 0.061540 | 1.307000e-01 | 4.268000e-01 |
| concave points_mean | 569.0 | 4.891915e-02 | 3.880284e-02 | 0.000000 | 0.020310 | 0.033500 | 7.400000e-02 | 2.012000e-01 |
| symmetry_mean | 569.0 | 1.811619e-01 | 2.741428e-02 | 0.106000 | 0.161900 | 0.179200 | 1.957000e-01 | 3.040000e-01 |
| fractal_dimension_mean | 569.0 | 6.279761e-02 | 7.060363e-03 | 0.049960 | 0.057700 | 0.061540 | 6.612000e-02 | 9.744000e-02 |
| radius_se | 569.0 | 4.051721e-01 | 2.773127e-01 | 0.111500 | 0.232400 | 0.324200 | 4.789000e-01 | 2.873000e+00 |
| texture_se | 569.0 | 1.216853e+00 | 5.516484e-01 | 0.360200 | 0.833900 | 1.108000 | 1.474000e+00 | 4.885000e+00 |
| perimeter_se | 569.0 | 2.866059e+00 | 2.021855e+00 | 0.757000 | 1.606000 | 2.287000 | 3.357000e+00 | 2.198000e+01 |
| area_se | 569.0 | 4.033708e+01 | 4.549101e+01 | 6.802000 | 17.850000 | 24.530000 | 4.519000e+01 | 5.422000e+02 |
| smoothness_se | 569.0 | 7.040979e-03 | 3.002518e-03 | 0.001713 | 0.005169 | 0.006380 | 8.146000e-03 | 3.113000e-02 |
| compactness_se | 569.0 | 2.547814e-02 | 1.790818e-02 | 0.002252 | 0.013080 | 0.020450 | 3.245000e-02 | 1.354000e-01 |
| concavity_se | 569.0 | 3.189372e-02 | 3.018606e-02 | 0.000000 | 0.015090 | 0.025890 | 4.205000e-02 | 3.960000e-01 |
| concave points_se | 569.0 | 1.179614e-02 | 6.170285e-03 | 0.000000 | 0.007638 | 0.010930 | 1.471000e-02 | 5.279000e-02 |
| symmetry_se | 569.0 | 2.054230e-02 | 8.266372e-03 | 0.007882 | 0.015160 | 0.018730 | 2.348000e-02 | 7.895000e-02 |
| fractal_dimension_se | 569.0 | 3.794904e-03 | 2.646071e-03 | 0.000895 | 0.002248 | 0.003187 | 4.558000e-03 | 2.984000e-02 |
| radius_worst | 569.0 | 1.626919e+01 | 4.833242e+00 | 7.930000 | 13.010000 | 14.970000 | 1.879000e+01 | 3.604000e+01 |
| texture_worst | 569.0 | 2.567722e+01 | 6.146258e+00 | 12.020000 | 21.080000 | 25.410000 | 2.972000e+01 | 4.954000e+01 |
| perimeter_worst | 569.0 | 1.072612e+02 | 3.360254e+01 | 50.410000 | 84.110000 | 97.660000 | 1.254000e+02 | 2.512000e+02 |
| area_worst | 569.0 | 8.805831e+02 | 5.693570e+02 | 185.200000 | 515.300000 | 686.500000 | 1.084000e+03 | 4.254000e+03 |
| smoothness_worst | 569.0 | 1.323686e-01 | 2.283243e-02 | 0.071170 | 0.116600 | 0.131300 | 1.460000e-01 | 2.226000e-01 |
| compactness_worst | 569.0 | 2.542650e-01 | 1.573365e-01 | 0.027290 | 0.147200 | 0.211900 | 3.391000e-01 | 1.058000e+00 |
| concavity_worst | 569.0 | 2.721885e-01 | 2.086243e-01 | 0.000000 | 0.114500 | 0.226700 | 3.829000e-01 | 1.252000e+00 |
| concave points_worst | 569.0 | 1.146062e-01 | 6.573234e-02 | 0.000000 | 0.064930 | 0.099930 | 1.614000e-01 | 2.910000e-01 |
| symmetry_worst | 569.0 | 2.900756e-01 | 6.186747e-02 | 0.156500 | 0.250400 | 0.282200 | 3.179000e-01 | 6.638000e-01 |
| fractal_dimension_worst | 569.0 | 8.394582e-02 | 1.806127e-02 | 0.055040 | 0.071460 | 0.080040 | 9.208000e-02 | 2.075000e-01 |
| Unnamed: 32 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [11]:
# Summary statistics for every column regardless of dtype, one row per column.
full_summary = data.describe(include="all")
full_summary.transpose()
Out[11]:
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 569.0 | NaN | NaN | NaN | 30371831.432337 | 125020585.612224 | 8670.0 | 869218.0 | 906024.0 | 8813129.0 | 911320502.0 |
| diagnosis | 569 | 2 | B | 357 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| radius_mean | 569.0 | NaN | NaN | NaN | 14.127292 | 3.524049 | 6.981 | 11.7 | 13.37 | 15.78 | 28.11 |
| texture_mean | 569.0 | NaN | NaN | NaN | 19.289649 | 4.301036 | 9.71 | 16.17 | 18.84 | 21.8 | 39.28 |
| perimeter_mean | 569.0 | NaN | NaN | NaN | 91.969033 | 24.298981 | 43.79 | 75.17 | 86.24 | 104.1 | 188.5 |
| area_mean | 569.0 | NaN | NaN | NaN | 654.889104 | 351.914129 | 143.5 | 420.3 | 551.1 | 782.7 | 2501.0 |
| smoothness_mean | 569.0 | NaN | NaN | NaN | 0.09636 | 0.014064 | 0.05263 | 0.08637 | 0.09587 | 0.1053 | 0.1634 |
| compactness_mean | 569.0 | NaN | NaN | NaN | 0.104341 | 0.052813 | 0.01938 | 0.06492 | 0.09263 | 0.1304 | 0.3454 |
| concavity_mean | 569.0 | NaN | NaN | NaN | 0.088799 | 0.07972 | 0.0 | 0.02956 | 0.06154 | 0.1307 | 0.4268 |
| concave points_mean | 569.0 | NaN | NaN | NaN | 0.048919 | 0.038803 | 0.0 | 0.02031 | 0.0335 | 0.074 | 0.2012 |
| symmetry_mean | 569.0 | NaN | NaN | NaN | 0.181162 | 0.027414 | 0.106 | 0.1619 | 0.1792 | 0.1957 | 0.304 |
| fractal_dimension_mean | 569.0 | NaN | NaN | NaN | 0.062798 | 0.00706 | 0.04996 | 0.0577 | 0.06154 | 0.06612 | 0.09744 |
| radius_se | 569.0 | NaN | NaN | NaN | 0.405172 | 0.277313 | 0.1115 | 0.2324 | 0.3242 | 0.4789 | 2.873 |
| texture_se | 569.0 | NaN | NaN | NaN | 1.216853 | 0.551648 | 0.3602 | 0.8339 | 1.108 | 1.474 | 4.885 |
| perimeter_se | 569.0 | NaN | NaN | NaN | 2.866059 | 2.021855 | 0.757 | 1.606 | 2.287 | 3.357 | 21.98 |
| area_se | 569.0 | NaN | NaN | NaN | 40.337079 | 45.491006 | 6.802 | 17.85 | 24.53 | 45.19 | 542.2 |
| smoothness_se | 569.0 | NaN | NaN | NaN | 0.007041 | 0.003003 | 0.001713 | 0.005169 | 0.00638 | 0.008146 | 0.03113 |
| compactness_se | 569.0 | NaN | NaN | NaN | 0.025478 | 0.017908 | 0.002252 | 0.01308 | 0.02045 | 0.03245 | 0.1354 |
| concavity_se | 569.0 | NaN | NaN | NaN | 0.031894 | 0.030186 | 0.0 | 0.01509 | 0.02589 | 0.04205 | 0.396 |
| concave points_se | 569.0 | NaN | NaN | NaN | 0.011796 | 0.00617 | 0.0 | 0.007638 | 0.01093 | 0.01471 | 0.05279 |
| symmetry_se | 569.0 | NaN | NaN | NaN | 0.020542 | 0.008266 | 0.007882 | 0.01516 | 0.01873 | 0.02348 | 0.07895 |
| fractal_dimension_se | 569.0 | NaN | NaN | NaN | 0.003795 | 0.002646 | 0.000895 | 0.002248 | 0.003187 | 0.004558 | 0.02984 |
| radius_worst | 569.0 | NaN | NaN | NaN | 16.26919 | 4.833242 | 7.93 | 13.01 | 14.97 | 18.79 | 36.04 |
| texture_worst | 569.0 | NaN | NaN | NaN | 25.677223 | 6.146258 | 12.02 | 21.08 | 25.41 | 29.72 | 49.54 |
| perimeter_worst | 569.0 | NaN | NaN | NaN | 107.261213 | 33.602542 | 50.41 | 84.11 | 97.66 | 125.4 | 251.2 |
| area_worst | 569.0 | NaN | NaN | NaN | 880.583128 | 569.356993 | 185.2 | 515.3 | 686.5 | 1084.0 | 4254.0 |
| smoothness_worst | 569.0 | NaN | NaN | NaN | 0.132369 | 0.022832 | 0.07117 | 0.1166 | 0.1313 | 0.146 | 0.2226 |
| compactness_worst | 569.0 | NaN | NaN | NaN | 0.254265 | 0.157336 | 0.02729 | 0.1472 | 0.2119 | 0.3391 | 1.058 |
| concavity_worst | 569.0 | NaN | NaN | NaN | 0.272188 | 0.208624 | 0.0 | 0.1145 | 0.2267 | 0.3829 | 1.252 |
| concave points_worst | 569.0 | NaN | NaN | NaN | 0.114606 | 0.065732 | 0.0 | 0.06493 | 0.09993 | 0.1614 | 0.291 |
| symmetry_worst | 569.0 | NaN | NaN | NaN | 0.290076 | 0.061867 | 0.1565 | 0.2504 | 0.2822 | 0.3179 | 0.6638 |
| fractal_dimension_worst | 569.0 | NaN | NaN | NaN | 0.083946 | 0.018061 | 0.05504 | 0.07146 | 0.08004 | 0.09208 | 0.2075 |
| Unnamed: 32 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [12]:
# Summary statistics restricted to object-dtype columns, one row per column.
object_summary = data.describe(include="O")
object_summary.transpose()
Out[12]:
| count | unique | top | freq | |
|---|---|---|---|---|
| diagnosis | 569 | 2 | B | 357 |
In [ ]:
Visualizations:¶
In [13]:
# Histograms of the numeric columns, one subplot per column.
# matplotlib / seaborn are imported in the notebook's top import cell.

# Skip columns that contain no observed values (the info() output reports
# "Unnamed: 32" with 0 non-null entries) so the grid has no empty panel.
hist_cols = [col for col in numerical_col if data[col].notna().any()]
hist_axes = data.hist(column=hist_cols, figsize=(40, 20))

# plt.ylabel() only labels the current (last) subplot, so set the label on
# every axis of the grid instead.
for ax in hist_axes.ravel():
    ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()
In [38]:
# Pairwise scatter plots of the "*_mean" features, coloured by diagnosis.
# Passing the whole frame would build a grid over every numeric column,
# including the "id" identifier and the empty "Unnamed: 32" column, which is
# very slow to draw and too dense to read.
mean_features = [col for col in data.columns if col.endswith("_mean")]
sns.pairplot(data, vars=mean_features, hue="diagnosis")
# No plt.tight_layout() here: pairplot arranges its own grid and legend
# (NOTE(review): confirm the legend placement on the installed seaborn version).
plt.show()